###################
#load libraries
###################
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
library(ggplot2)

Setup the Seurat Object

############################
#Setup the Seurat Object
############################

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./data_new/002/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat 
## 22302 features across 11922 samples within 1 assay 
## Active assay: RNA (22302 features, 0 variable features)

Standard pre-processing workflow

QC and selecting cells for further analysis

  • Visualize QC metrics as a violin plot
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

  • FeatureScatter is typically used to visualize feature-feature relationships, but can be used for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

pbmc <- subset(pbmc, subset = nFeature_RNA > 1000 & nFeature_RNA < 4000 & percent.mt < 15)

Normalizing the data

#pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- NormalizeData(pbmc)

Identification of highly variable features (feature selection)

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing missing values (geom_point).

Scaling the data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix

Perform linear dimensional reduction

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  LTB, CD3E, IL32, CD3D, TRAC, PCED1B-AS1, CD3G, TRBC2, ETS1, MALAT1 
##     BCL11B, IL7R, ARL4C, CD2, LIME1, TCF7, CD7, LINC00861, PRKCQ-AS1, CD27 
##     FCMR, CCR7, PIK3IP1, CD247, TRBC1, GZMM, LEF1, NOSIP, ISG20, BCL2 
## Negative:  FGL2, FCN1, LYZ, CST3, MNDA, CTSS, TYMP, CYBB, SERPINA1, IFI30 
##     PSAP, NCF2, S100A9, LST1, TYROBP, AIF1, CSTA, MPEG1, TNFSF13B, TMEM176B 
##     FTL, DUSP6, CD68, MS4A6A, S100A8, DUSP1, FOS, GRN, AC020656.1, CD36 
## PC_ 2 
## Positive:  MS4A1, CD79A, IGHM, BANK1, CD79B, LINC00926, RALGPS2, TNFRSF13C, IGHD, NIBAN3 
##     CD22, HLA-DQA1, IGKC, BCL11A, AFF3, LINC02397, SPIB, PAX5, VPREB3, FCRL1 
##     BLK, FCER2, CD74, HLA-DRA, SWAP70, HLA-DRB1, COBLL1, HLA-DOB, BLNK, P2RX5 
## Negative:  IL32, CD3E, GZMM, CD3D, CTSW, CD2, CD247, ANXA1, CD3G, CD7 
##     GZMA, BCL11B, S100A4, NKG7, CST7, TRAC, PRF1, LINC00861, TMSB4X, GNLY 
##     CCL5, ARL4C, KLRK1, KLRD1, S100A10, SAMD3, IL7R, ID2, FGFBP2, TRBC1 
## PC_ 3 
## Positive:  IL7R, CCR7, LEF1, TCF7, MAL, RCAN3, TRABD2A, PIK3IP1, NOSIP, CD27 
##     OXNAD1, CAMK4, LTB, PRKCQ-AS1, TRAC, PDE3B, CHRM3-AS2, BCL11B, FHIT, LRRN3 
##     VIM, RGCC, NELL2, PASK, CD3G, CD3D, TRAT1, AQP3, ABLIM1, S100A12 
## Negative:  NKG7, GNLY, GZMB, KLRD1, CST7, PRF1, FGFBP2, GZMA, KLRF1, CCL4 
##     HOPX, SPON2, CLIC3, GZMH, CCL5, ADGRG1, FCGR3A, PTGDR, CD160, TBX21 
##     XCL2, MATK, IL2RB, C12orf75, TRDC, LAIR2, CTSW, PLEK, S1PR5, FCRL6 
## PC_ 4 
## Positive:  CDKN1C, HES4, TCF7L2, FCGR3A, CTSL, BATF3, MS4A7, SIGLEC10, CSF1R, CKB 
##     MTSS1, CASP5, IFITM3, AC104809.2, RRAS, CALML4, SMIM25, MS4A4A, ABI3, AC064805.1 
##     RHOC, CAMK1, NEURL1, CALHM6, NAP1L1, GPBAR1, HMOX1, HCAR3, ZNF703, LY6E 
## Negative:  S100A12, SLC2A3, VCAN, PADI4, CYP1B1, ALOX5AP, ITGAM, CES1, MGST1, S100A8 
##     RNASE6, MEGF9, METTL9, QPCT, MCEMP1, VNN2, THBS1, PGD, CTSD, RBP7 
##     CD14, RNASE2, BST1, AC020916.1, F5, PLBD1, MARC1, CD36, CKAP4, CRISPLD2 
## PC_ 5 
## Positive:  LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2, SCT, DNASE1L3, MAP1A, PLD4, IL3RA 
##     LINC00996, EPHB1, PTCRA, ITM2C, LAMP5, SMPD3, PACSIN1, CIB2, TNFRSF21, SCAMP5 
##     PHEX, DERL3, GAS6, PLEKHD1, SCN9A, AC097375.1, ZFAT, SMIM5, AL096865.1, NRP1 
## Negative:  MS4A1, LINC00926, CD79A, BANK1, TNFRSF13C, IGHD, CD79B, CD22, FCRL1, RALGPS2 
##     PAX5, VPREB3, FCRL3, GNLY, FCER2, KLRD1, LINC02397, CCL4, FGFBP2, PRF1 
##     CST7, HLA-DOB, NKG7, HOPX, P2RX5, ADAM28, GZMA, KLRF1, SWAP70, GZMH
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  LTB, CD3E, IL32, CD3D, TRAC 
## Negative:  FGL2, FCN1, LYZ, CST3, MNDA 
## PC_ 2 
## Positive:  MS4A1, CD79A, IGHM, BANK1, CD79B 
## Negative:  IL32, CD3E, GZMM, CD3D, CTSW 
## PC_ 3 
## Positive:  IL7R, CCR7, LEF1, TCF7, MAL 
## Negative:  NKG7, GNLY, GZMB, KLRD1, CST7 
## PC_ 4 
## Positive:  CDKN1C, HES4, TCF7L2, FCGR3A, CTSL 
## Negative:  S100A12, SLC2A3, VCAN, PADI4, CYP1B1 
## PC_ 5 
## Positive:  LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2 
## Negative:  MS4A1, LINC00926, CD79A, BANK1, TNFRSF13C
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)
JackStrawPlot(pbmc, dims = 1:40)
## Warning: Removed 61657 rows containing missing values (geom_point).

ElbowPlot(pbmc, ndims = 50)

Cluster the cells

pbmc <- FindNeighbors(pbmc, dims = 1:36)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10877
## Number of edges: 427932
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9139
## Number of communities: 17
## Elapsed time: 2 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
## AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
##                  7                  0                  0                  1 
## AAACCCACAATCGAAA-1 
##                  6 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

Run non-linear dimensional reduction (UMAP/tSNE)

# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunUMAP(pbmc, dims = 1:36)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 20:39:29 UMAP embedding parameters a = 0.9922 b = 1.112
## 20:39:29 Read 10877 rows and found 36 numeric columns
## 20:39:29 Using Annoy for neighbor search, n_neighbors = 30
## 20:39:29 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:39:31 Writing NN index file to temp file C:\Users\Daniel\AppData\Local\Temp\RtmpuMS6TS\file3f8413750d
## 20:39:31 Searching Annoy index using 1 thread, search_k = 3000
## 20:39:34 Annoy recall = 100%
## 20:39:34 Commencing smooth kNN distance calibration using 1 thread
## 20:39:35 Initializing from normalized Laplacian + noise
## 20:39:35 Commencing optimization for 200 epochs, with 470104 positive edges
## 20:39:47 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")

#UMAPPlot(pbmc)
#saveRDS(pbmc, file = "./pbmc_tutorial.rds")
#run TSNE
pbmc <- RunTSNE(pbmc, dims = 1:36)
TSNEPlot(pbmc)

Finding differentially expressed features (cluster biomarkers)

  • find all markers of cluster 2
# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
  • find all markers distinguishing cluster 5 from clusters 0 and 3
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
  • find markers for every cluster compared to all remaining cells, report only the positive ones
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
pbmc.markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC) -> top2
top_gene <- top2$gene[seq(1, nrow(top2), 2)]
top2
#cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = top_gene, ncol = 3)

  • raw counts
# you can plot raw counts as well
VlnPlot(pbmc, features = top_gene, slot = "counts", log = TRUE, ncol = 3)

  • Feature plot
FeaturePlot(pbmc, features = top_gene, ncol = 3)

FeaturePlot(pbmc, features = top_gene, reduction = "tsne", ncol = 3)

  • DoHeatmap
pbmc.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

Assigning cell type identity to clusters

# new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
#    "NK", "DC", "Platelet")
#names(new.cluster.ids) <- levels(pbmc)
#pbmc <- RenameIdents(pbmc, new.cluster.ids)
#DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()